Title: WIP

Download Raw PBMC data from the 10X website

# BiocManager::install("BiocFileCache")
library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
raw.path <- bfcrpath(bfc, file.path("http://cf.10xgenomics.com/samples",
    "cell-exp/2.1.0/pbmc4k/pbmc4k_raw_gene_bc_matrices.tar.gz"))
untar(raw.path, exdir=file.path(tempdir(), "pbmc4k"))
# BiocManager::install("DropletUtils")

Setup

Here, we will do some preparation of the dataset by:

  • Load in the data by importing into R as a SingleCellExperiment object
  • Perform QC selection on our cells
  • Run dimensionality reduction and clustering

Import as a SingleCellExperiment

SingleCellExperiment is [Description]

library(DropletUtils)
fname <- file.path(tempdir(), "pbmc4k/raw_gene_bc_matrices/GRCh38")
sce.pbmc <- read10xCounts(fname, col.names=TRUE)

Annotate genes

Sets the with SYMBOL names rather than the Ensembl Gene ID thwy start as, and add mapping information

# BiocManager::install("scater")
library(scater)
rownames(sce.pbmc) <- uniquifyFeatureNames(rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)
# BiocManager::install("EnsDb.Hsapiens.v86")
library(EnsDb.Hsapiens.v86)
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID,
    column="SEQNAME", keytype="GENEID")

Distinguish High Quality Cells

[Description]

set.seed(100)
e.out <- emptyDrops(counts(sce.pbmc))
sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]
# Calculate QC statistics
sce.pbmc <- calculateQCMetrics(sce.pbmc, feature_controls=list(Mito=which(location=="MT")))
# Determine outliers for total reads percent.mito per cell (makes a logical the length of our #cells)
high.mito <- isOutlier(sce.pbmc$pct_counts_Mito, nmads=3, type="higher")
# Remove cells with high mitochondrial read percentages
sce.pbmc <- sce.pbmc[,!high.mito]

We arrive at 3922 quality cells.

Normalize cells

# BiocManager::install("scran")
library(scran)
set.seed(1000)
clusters <- quickCluster(sce.pbmc)
sce.pbmc <- computeSumFactors(sce.pbmc, min.mean=0.1, cluster=clusters)
sce.pbmc <- normalize(sce.pbmc)

Run Dimensionality Reduction: PCA, TSNE, and UMAP

First, select features.

fit.pbmc <- trendVar(sce.pbmc, use.spikes=FALSE)
dec.pbmc <- decomposeVar(fit=fit.pbmc)
o <- order(dec.pbmc$bio, decreasing=TRUE)
chosen.hvgs <- rownames(dec.pbmc)[head(o, 2000)]

Then, run dimensionality reductions.

set.seed(10000)
sce.pbmc <- runPCA(sce.pbmc, feature_set=chosen.hvgs, ncomponents=25,
    BSPARAM=BiocSingular::IrlbaParam())
set.seed(100000)
# BiocManager::install("Rtsne")
# install.packages("Rtsne")
sce.pbmc <- runTSNE(sce.pbmc, use_dimred="PCA")
set.seed(1000000)
# BiocManager::install("uwot")
# install.packages("uwot")
sce.pbmc <- runUMAP(sce.pbmc, use_dimred="PCA")

Run Clustering

g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
sce.pbmc$cluster <- factor(clust)

Run SingleR

Before we start, what do we have?

library(DittoSeq)
DEFAULT <- "sce.pbmc"
DBDimPlot("cluster", do.letter = FALSE)

First, we need to load in reference datasets. This sis human data, and there are two included in the SingleR package that we can use. We can use the code below to download (and cache them for future use).

library(SingleR)
blueprint.encode <- BlueprintEncodeData()
hpca <- HumanPrimaryCellAtlasData()

Now we can compare our cells to the references

#Run SingleR
singler.results.fine <- SingleR(test=sce.pbmc, ref=blueprint.encode, labels=blueprint.encode$label.fine, genes="de")
singler.results.main <- SingleR(test=sce.pbmc, ref=blueprint.encode, labels=blueprint.encode$label.main, genes="de")

Now if we want to check the scoring of an individual cell, we can compare its transcriptome to that of the database. The first cell was labeled as Monocytes (singler.results.fine$labels[1]).

keep <- !pruneScores(scores = singler.results.fine$scores, min.diff = 0.1, nmads = 3)
DBDimPlot(singler.results.fine$labels, do.letter = FALSE)

DBDimPlot(singler.results.fine$labels, do.letter = FALSE,
    cells.use = keep,
    colors = c(1:24)[levels(as.factor(singler.results.fine$labels)) %in%
levels(as.factor(singler.results.fine$labels[keep]))])

plotScoreHeatmap(results = singler.results.fine, max.labels = 10,
                 clusters = singler.results.fine$labels, show.pruned = TRUE,
                 prune.calls = as.character(!keep))

keep <- !pruneScores(scores = singler.results.main$scores, min.diff = 0.1, nmads = 3)
DBDimPlot(singler.results.main$labels, do.letter = FALSE)

DBDimPlot(singler.results.main$labels, do.letter = FALSE,
    cells.use = keep,
    colors = c(1:24)[levels(as.factor(singler.results.main$labels)) %in%
levels(as.factor(singler.results.main$labels[keep]))])

plotScoreHeatmap(results = singler.results.main, max.labels = 10,
                 clusters = singler.results.main$labels, show.pruned = TRUE,
                 prune.calls = as.character(!keep))

# Dan's scoring mod
pruneScores <- function(scores, min.diff.vs.median = 0.05, nmads=3, min.percent.diff.vs.next = 0.05) {
    if (is(scores, "DataFrame")){
      scores <- scores$scores
    }

    maxed <- rowMaxs(DelayedArray(scores))
    delta <- maxed - rowMedians(scores)
    keep <- delta >= min.diff.vs.median
    keep <- keep & (rowSums(scores/maxed > (1-min.percent.diff.vs.next)) < 2)

    best <- max.col(scores)
    by.label <- split(which(keep), best[keep])

    for (l in by.label) {
        current <- maxed[l]
        med <- median(current)
        MAD <- mad(current, center=med)
        keep[l] <- (current >= med - nmads * MAD)
    }

    !keep
}
keep <- !pruneScores(scores = singler.results.fine$scores, min.diff.vs.median = 0.1, min.percent.diff.vs.next = 0.01, nmads = 3)
DBDimPlot(singler.results.fine$labels, do.letter = FALSE)

DBDimPlot(singler.results.fine$labels, do.letter = FALSE,
    cells.use = keep,
    colors = c(1:24)[levels(as.factor(singler.results.fine$labels)) %in%
levels(as.factor(singler.results.fine$labels[keep]))])

plotScoreHeatmap(results = singler.results.fine, max.labels = 10,
                 clusters = singler.results.fine$labels, show.pruned = TRUE,
                 prune.calls = as.character(!keep))

keep <- !pruneScores(scores = singler.results.main$scores, min.diff.vs.median = 0.1, min.percent.diff.vs.next = 0.01, nmads = 3)
DBDimPlot(singler.results.main$labels, do.letter = FALSE)

DBDimPlot(singler.results.main$labels, do.letter = FALSE,
    cells.use = keep,
    colors = c(1:24)[levels(as.factor(singler.results.main$labels)) %in%
levels(as.factor(singler.results.main$labels[keep]))])

plotScoreHeatmap(results = singler.results.main, max.labels = 10,
                 clusters = singler.results.main$labels, show.pruned = TRUE,
                 prune.calls = as.character(!keep))